import pandas as pd
# numpy: numerical python
import numpy as np
# Load the penguins dataset from CSV into a DataFrame.
penguins = pd.read_csv("data/penguins.csv")
# Summarise the dataframe: numeric columns only (the describe() default here).
penguins.describe()
# Summarise the dataframe: every column, numeric or not.
penguins.describe(include='all')
# Average (mean) of the bill length.
penguins['bill_length_mm'].mean()
# output: 43.9219298245614
# Standard deviation of the bill length.
penguins['bill_length_mm'].std()
# output: 5.4595837139265315
# Median of the bill length.
penguins['bill_length_mm'].median()
# output: 44.45
# Group by a column, then aggregate (sum / mean / median ...).
# Filtering by hand covers only one species per expression:
# penguins[penguins['species'] == 'Adelie']['bill_length_mm'].mean()
# groupby computes the statistic for every species in a single call:
# penguins.groupby('species')['bill_length_mm'].mean()
penguins.groupby(by='species')['bill_length_mm'].median()
# Several aggregate functions at once: one output column per function.
penguins.groupby(by='species')['bill_length_mm'].aggregate(
    ['min', 'mean', 'median', 'std', 'max']
)
# Group by more than one column: one output row per (island, species) pair.
penguins.groupby(by=['island', 'species'])['bill_length_mm'].aggregate(
    ['min', 'mean', 'max']
)
# Group by more than one column, turn the group keys back into ordinary
# columns with reset_index(), and export the table to a .csv file.
result = (
    penguins.groupby(['island', 'species'])['bill_length_mm']
    .agg(['min', 'mean', 'max'])
    .reset_index()
)
# index=False: after reset_index() the index is only a 0..n-1 row counter,
# which would otherwise be written as an extra unnamed first column.
result.to_csv('result.csv', index=False)
result
# When a method chain gets long, wrap it in parentheses so it can be split
# across lines, one step per line.
(
    penguins.groupby(['island', 'species'])['bill_length_mm']
    .agg(['min', 'mean', 'max'])
    .reset_index()
)
# Recode the sex column: MALE -> m, FEMALE -> f.
# Values missing from the mapping come back from map() as NaN, so the
# fillna() step labels them (and originally missing values) as 'other'.
# penguins['sex'].head()
penguins['sex_new'] = (
    penguins['sex']
    .map(dict(MALE='m', FEMALE='f'))
    .fillna('other')
)
penguins.head()
# pandas style: call the reduction as a method on the column.
penguins['bill_length_mm'].mean()
# output: 43.9219298245614
# numpy style: pass the column to the numpy function.
np.mean(penguins['bill_length_mm'])
# output: 43.9219298245614
# Other numpy functions.
# NOTE: np.std defaults to ddof=0 (population standard deviation), while
# Series.std() defaults to ddof=1 (sample standard deviation), so the two
# styles are not interchangeable for std.
print(np.sum(penguins['bill_depth_mm']))
print(np.std(penguins['body_mass_g']))
# output:
# 5865.700000000001
# 800.7812292384522
# Condition: label each score by testing it against a threshold.
score = pd.Series([80, 55, 62, 95, 20])
print(score)
# output:
# 0    80
# 1    55
# 2    62
# 3    95
# 4    20
# dtype: int64

# np.where(condition, value_if_true, value_if_false) is applied element-wise
# and returns a numpy array of the chosen values.
grade = np.where(score >= 80, 'passed', 'failed')
print(grade)
# output:
# ['passed' 'failed' 'failed' 'passed' 'failed']
# Keep only the Adelie rows and three columns of interest, then drop any row
# that has a missing value in those columns.
df = penguins.query("species == 'Adelie'")[['species', 'island', 'bill_length_mm']].dropna()
df.head()
# The comparison already yields a boolean column, so wrapping it in
# np.where(cond, True, False) is unnecessary.
df['new_column'] = df['bill_length_mm'] > 40
df.head(10)
# Merge two dataframes that share a 'key' column.
left = dict(
    key=[1, 2, 3, 4],
    name=['toy', 'joe', 'jane', 'anna'],
    age=[25, 28, 30, 22],
)
right = dict(
    key=[1, 2, 3, 4],
    city=['Bangkok', 'London', 'Seoul', 'Tokyo'],
    zip=[1001, 2504, 2094, 9802],
)
df_left = pd.DataFrame(data=left)
df_right = pd.DataFrame(data=right)
df_left
df_right
# Rows are matched on equal 'key' values; the columns of both frames are
# combined, left frame's columns first.
df_result = df_left.merge(df_right, on='key')
df_result
# Histogram of one column.
# The trailing semicolon hides the <Axes: ylabel='Frequency'> repr in a notebook.
penguins['body_mass_g'].plot.hist();
# Histogram of two columns.
penguins[['body_mass_g', 'bill_length_mm']].plot.hist(bins=30);
penguins[['bill_length_mm']].plot.hist(bins=30, color='orange');
# Bar plot of the number of rows per species.
penguins['species'].value_counts().plot.bar(color=['salmon', 'orange', 'gold']);
# Scatter plot of bill length against bill depth.
(
    penguins[['bill_length_mm', 'bill_depth_mm']]
    .plot.scatter(x='bill_length_mm', y='bill_depth_mm', color='orange')
);
# Datalore visualization ~> select the "Visualize" tab on the displayed dataframe.
penguins